* THIS IS THE FILE THAT TRIMS THE DATA OF UNRELIABLE VALUES.  THIS IS TAKEN FROM THE SCHARR WORK p33 OF THE MODELLING
* Very little is changed in this - only the descriptive stats (to see how many affected), and some illustrative GHS analysis with 1 outlier removed
* 25/1/2009


* Session setup: pin the Stata version, switch off output paging, set the memory allocation, open the
* project's working do-file in the editor, add the project ado directory to the search path, and (re)open the log.
version 9.0
set more off

set mem 500m
doedit "E:\Current work\CSR\Data analysis\working.do"		//	This simply means you can load up the do file by double-clicking
adopath + "E:\Current work\CSR\Data analysis"
* net from http://www.indiana.edu/~jslsoc/stata			//  Installs the SPost package, mainly here for MISSCHK which is just convenient...
* net install spost9_ado

* Close any log left open by an earlier run (capture suppresses the error if none is open), then append to the text log
capture log close
log using "E:\Current work\CSR\Data analysis\Aug_08", append text
* Running the file as a whole stops here; the sections below are presumably run by selecting them in the do-file editor
exit

/* GENERAL NOTES
	- Output initially used the csr.ado command created in early 2007
	- More recently this has been replaced by a series of commands using the following codes, followed by R for reduced to limit and 0 for looking
	at the amount of consumption done in this way (e.g. a0 is the command for looking at the total amount currently drunk over daily limits):
		a = Daily limits (4/3)
		b = Binge-drinking limits (8/6)
		c = Weekly limits (21/14)
		d = Nadir (White)
		e = Drink-driving
*/


******************************************************************************************************************************************************
******************************************************************************************************************************************************
* DESCRIPTIVE STATS **********************************************************************************************************************************
******************************************************************************************************************************************************
******************************************************************************************************************************************************

* NDNS: weighted descriptive statistics for the diary data
cd "Z:\IAS STATA files\NDNS 19-64\stata6"
use allalc.dta, replace
* drinkers is a 0/1 flag for alcundr above zero, left missing wherever alcundr is missing
gen drinkers = (alcundr>0) if alcundr<.

svyset [pweight=diarywgt]
svy: mean respage respsex drinkers alcundr if alcundr<.
sum respage, det
sum alcundr, det

// Outlier check on the total and on alcuni1-alcuni7 - there aren't any...
sum alcundr, det
forvalues k = 1/7 {
	sum alcuni`k', det
}

// Weighted percentiles (1st to 99th, then 99.9th) of alcundr for 19-64 year olds, to compare to GHS
_pctile alcundr [pweight=diarywgt] if respage>=19 & respage<65, percentiles (1 (1) 99 99.9)
// display "Percentage from bottom"		_col(20) "Units per week"
forvalues pct = 1/99 {
	display "`pct'%" 		_col(20) r(r`pct')
}
// 99.9 is the 100th percentile requested, so _pctile returns it in r(r100)
display "Bottom 99.9%" 	_col(20) r(r100)


* GHS: weighted descriptive statistics
cd "Z:\IAS STATA files\GHS 2000-1\stata8"
use working.dta, replace
* drinkers is a 0/1 flag for drating above zero, left missing wherever drating is missing
gen drinkers = (drating>0) if drating<.

svyset _n [pweight=weight00]
svy: mean drating if age==18
svy: mean drating if age>=65
svy: mean age sex drinkers drating if drating<.
sum age if drating<., det
sum drating, det

* dratingT is a trimmed copy of drating with values above 300 set to missing
gen dratingT = drating
replace dratingT = . if drating >300	// Only one value is changed 

// Weighted percentiles (1st to 99th, then 99.9th) of the trimmed measure for 19-64 year olds, to compare to NDNS
_pctile dratingT [pweight=weight00] if age>=19 & age<65, percentiles (1 (1) 99 99.9)
// display "Percentage from bottom"		_col(20) "Units per week"
forvalues pct = 1/99 {
	display "`pct'%" 		_col(20) r(r`pct')
}
// 99.9 is the 100th percentile requested, so _pctile returns it in r(r100)
display "Bottom 99.9%" 	_col(20) r(r100)
			

* SDDUYP: unweighted descriptive statistics for the young people survey
* (the closing quote was missing from the path below; it is needed because the path contains spaces and parentheses)
cd "Z:\IAS STATA files\Young people survey (SN4648)\stata6"
use working.dta, replace
* drinkers is a 0/1 flag for dal7ut above zero, left missing wherever dal7ut is missing
gen drinkers = 0 if dal7ut <.
replace drinkers = 1 if dal7ut>0 & dal7ut<.

* No svyset here: these means and summaries are run without weights
mean age sex drinkers dal7ut if dal7ut <.
sum age if dal7ut<., det
sum dal7ut, det

* ONS Omnibus: weighted descriptive statistics
cd "Z:\IAS STATA files\ONS Omnibus 2002\stata6"
use working.dta, replace
* drinkers is a 0/1 flag for unityr above zero, left missing wherever unityr is missing
gen drinkers = 0 if unityr <.
replace drinkers = 1 if unityr>0 & unityr<.

svyset [pweight=wta]
* Means are restricted to respondents aged 18+ with non-missing unityr, bacfreq and ddfreq
svy: mean respage respsex drinkers unityr if unityr<. & bacfreq<. & ddfreq<. & respage>=18
svy: mean unityr if unityr<. & bacfreq<. & ddfreq<. & respage>=18
* Unweighted detailed summaries
sum respage if unityr<., det
sum unityr, det


******************************************************************************************************************************************************
******************************************************************************************************************************************************
* NDNS 19-64 *****************************************************************************************************************************************
******************************************************************************************************************************************************
******************************************************************************************************************************************************

** NOTE that weight01 is an fweight, not a pweight... DOES THIS RELATE TO NDNS?  DOESNT SEEM TO RELATE TO GHS

/* NOTES:
	VARIABLES:
		- Alcundr is based on alcoholic units from drinking only; alcoholic units from food are excluded (see alcunfd var).
	BIAS:
		- The diary sample (even when weighted) has a bias towards lower alcohol consumption, but this is only very slight (<1% of interview avg.)
		- More importantly, the diary measure has a lower average value than the interview measure - particularly when the correction for shandy
		is used (assuming that shandy is 1/2 strength of beer, rather than full strength; seems to be an error in their calculations).
	Datasets:
		allalc.dta		This dataset is the person-level dataset used for most analyses
		alcvars_long.dta	This dataset is the day-level dataset, used to look at the distribution of drinking days		
	Missing vars:
		Missing variables are constant throughout all the variables used (i.e. if missing for one, then missing for all) in the diary; diary weight
		takes account of non-response to the diary per se.
	Documentation:
		Look at the main report vol 2 (for alcohol results), and main user guide p119 for interview data on alcohol (p823 for derived var 
		specifications)..  Look at main user guide p845 for derived var specifications for diary data.  Furthermore, to understand the conversion 
		process for the diaries, look at details of MAFF databank in Appendix H of user guide		
	Data cleaning
		Didn't record this process (as it was done in early 2007, before I had formalised my data preparation process)
*/
* NDNS setup: run the interview data preparation file, load the person-level dataset and declare the diary weight
* do "E:\Current work\CSR\Data analysis\NDNS data preparation.do"		// This file contains all the data preparation for NDNS; is documented itself
do "E:\Current work\CSR\Data analysis\NDNS interview data preparation.do"		// This does all the work of moving the NDNS data across to GHS


cd "Z:\IAS STATA files\NDNS 19-64\stata6"
use allalc.dta, replace
discard							// This ensures that the user programmes are up-to-date
svyset [pweight=diarywgt]

*** BOOTSTRAPPED ESTIMATES
*	NOTE that the 'mean' command is unchanged by the absolute size of the weight, which means that weighted bootstraps are valid (I think...)
* Adds a second ado directory on drive G: in addition to the E: directory added at the top of the file
adopath + "G:\Current work\CSR\Data analysis" 			// For using other computers...


* Test commands: run every user-written limit command once, in the order aR a0 bR b0 cR c0 dR d0
foreach limit in a b c {
	foreach suffix in R 0 {
		`limit'`suffix' alcundr respsex
	}
}
* The d commands additionally take the age variable
foreach suffix in R 0 {
	d`suffix' alcundr respsex respage
}
* The M and N variants are run with age as well, showing e(res2) in red after each one
foreach cmd in aM bM dM aN bN dN {
	`cmd' alcundr respsex respage
	display in red e(res2)
}

* Test bootstraps: 10 replications of e(res1) per command, in the order a0 b0 c0 d0 then aR bR cR dR
foreach suffix in 0 R {
	foreach limit in a b c {
		bootstrap e(res1), reps(10): `limit'`suffix' alcundr respsex
	}
	* The d commands additionally take the age variable
	bootstrap e(res1), reps(10): d`suffix' alcundr respsex respage
}

* BCA full bootstraps
* bootstrap diff=(_b[alcundr] - _b[uni_c0]), reps(5) bca: mean drating uni_c0		// This is how to do it in a single line
set more off
set seed 3957575								// Seed borrowed from someone else's program; fixing it keeps the replications reproducible
local cmds "a0 b0 c0 d0 aR bR cR dR"
* Store the point estimate (r1_*) and the BCa confidence interval (r2_*) for each command
foreach cmd of local cmds {
	bootstrap e(res1), bca reps(2000): `cmd' alcundr respsex respage
	matrix r1_`cmd' = e(b)
	matrix r2_`cmd' = e(ci_bca)
}
* One line per command: name, estimate, then the two interval bounds
foreach cmd of local cmds {
	display "`cmd'" 	_col(20) %5.2f r1_`cmd'[1,1]		_col(40) %5.2f r2_`cmd'[1,1] ", " %5.2f r2_`cmd'[2,1]
}
* To check how many people are affected by each one
svy: proportion uni_aRf uni_bRf uni_cRf uni_dRf uni_a0f uni_b0f uni_c0f uni_d0f



******************************************************************************************************************************************************
******************************************************************************************************************************************************
* GHS WEEKLY DATA ************************************************************************************************************************************
******************************************************************************************************************************************************
******************************************************************************************************************************************************


/* MISSING VALUES:
	1. In general the dataset already has correctly set data to missing - e.g. drating is set to 0 for abstainers and missing for those who refused
	to answer about their drinking.  23 people refused to answer the initial drinking questions, which is a very low item refusal rate.

	2. All those who said they never drank (in the two questions drinknow and drinkany) were not asked how much they drank.  BUT those who said they 
	drank occasionally were asked (as you would hope).  

	3. 16-17 year olds were offered a self-completion option, but they seemed to have been asked with their parents roughly present, so under-reporting
	would seem likely.  The data is therefore restricted to those 18 and over only.  There may still be some residual under-reporting from asking adults 
	about their drinking with their family present.		

	4. GHS uses a two-stage sampling design: primary sampling units are postcode sectors, stratified according to region initially (London split into
	four - NE etc.).   The DEFT according to the 2000 report (p192) shows that the alcohol estimates are not subject to a large DEFT, typically
	inflating standard errors by 10% or less for both men and women	*/


*** MAIN WEEKLY RESULTS FROM PREPARATION FILE
* Load the prepared GHS working dataset and declare the survey design (each observation is its own sampling unit)
cd "Z:\IAS STATA files\GHS 2000-1\stata8"
* do "GHS data preparation.do"
use working.dta, replace

svyset _n [pweight=weight00]						// weight00 appears to be an fweight, but works acceptably as a pweight here
* svyset govreggb [pweight=weight00]				// My attempt on mirroring a complex sample design...

* Test commands: cR c0 dR d0, first for ages 18-64 and then for everyone aged 18+
foreach cond in "age>=18 & age<65" "age>=18" {
	foreach suffix in R 0 {
		c`suffix' drating sex if `cond'
	}
	* The d commands additionally take the age variable
	foreach suffix in R 0 {
		d`suffix' drating sex age if `cond'
	}
}

* Test commands with one trimmed value
* dratingT (drating with values above 300 set to missing) is generated in the descriptive-stats section, but
* working.dta is reloaded before this point, so rebuild it here when it is not already in memory.
* NOTE(review): this assumes dratingT was never saved into working.dta - if it was, the guard simply skips the rebuild.
capture confirm variable dratingT
if _rc {
	gen dratingT = drating
	replace dratingT = . if drating >300	// Only one value is changed
}
cR dratingT sex if age>=18 & age<65
c0 dratingT sex if age>=18 & age<65
dR dratingT sex age if age>=18 & age<65
d0 dratingT sex age if age>=18 & age<65
cR dratingT sex if age>=18
c0 dratingT sex if age>=18
dR dratingT sex age if age>=18
d0 dratingT sex age if age>=18

* Test bootstraps: 10 replications of e(res1) for cR c0 dR d0, first for ages 19-64 and then for everyone aged 18+
foreach cond in "age>=19 & age<65" "age>=18" {
	foreach suffix in R 0 {
		bootstrap e(res1), reps(10): c`suffix' drating sex if `cond'
	}
	* The d commands additionally take the age variable
	foreach suffix in R 0 {
		bootstrap e(res1), reps(10): d`suffix' drating sex age if `cond'
	}
}

* BCA full bootstraps - 19-64 year olds
capture drop _merge
set more off
set seed 3957575								// Seed borrowed from someone else's program; fixing it keeps the replications reproducible
* Commands are bootstrapped in one order and reported in another
local run_order "cR c0 dR d0"
local show_order "c0 d0 cR dR"
* Store the point estimate (r1_*) and the BCa confidence interval (r2_*) for each command
foreach cmd of local run_order {
	bootstrap e(res1), bca reps(2000): `cmd' drating sex age if age>=19 & age<65
	matrix r1_`cmd' = e(b)
	matrix r2_`cmd' = e(ci_bca)
}
* One line per command: name, estimate, then the two interval bounds
foreach cmd of local show_order {
	display "`cmd'" 	_col(20) %5.2f r1_`cmd'[1,1]		_col(40) %5.2f r2_`cmd'[1,1] ", " %5.2f r2_`cmd'[2,1]
}
* To check how many people are affected by each one
svy: proportion uni_cRf uni_dRf uni_c0f uni_d0f if age>=19 & age<65



* FULL AGE RANGE (everyone aged 18+); no new seed is set here, so the random-number stream carries on from wherever it was
capture drop _merge
local cmds "c0 d0 cR dR"
* Store the point estimate (r3_*) and the BCa confidence interval (r4_*) for each command
foreach cmd of local cmds {
	bootstrap e(res1), bca reps(2000): `cmd' drating sex age if age>=18 
	matrix r3_`cmd' = e(b)
	matrix r4_`cmd' = e(ci_bca)
}
* One line per command: name, estimate, then the two interval bounds
foreach cmd of local cmds {
	display "`cmd'" 	_col(20) %5.1f r3_`cmd'[1,1]		_col(40) %5.1f r4_`cmd'[1,1] ", " %5.1f r4_`cmd'[2,1]
}

* To check how many people are affected by each one
svy: proportion uni_cRf uni_dRf uni_c0f uni_d0f if age>=18 


*** CONVERSION FACTORS: this was an attempt at recalculating GHS data with new figures, but problem of deriving own vars that match supplied DVs...
* NOTE(review): the comment below used to close before the do line, which left both (mutually exclusive) replace
* statements live and the final close-comment marker unmatched.  The whole section is now a single comment, which
* looks like the original intent - confirm before relying on it.  To use it, run the do line and ONE of the replaces.
/*	A slight calculation error by original GHS team (possibly to match other data), in that pint-equivalents of strong beer bottles (sbrpint) are 
	based on size of bottle, but take no account of extra strength of sbeer over nbeer.  This has been changed, therefore doesn't exactly match

do "GHS conversion factors.do"		//  	This applies the revised way of estimating consumption to the original data
	* FOR ORIGINAL ESTIMATES:
		replace drating = drating_o
	* FOR REVISED ESTIMATES:
		replace drating = drating_i
*/



******************************************************************************************************************************************************
******************************************************************************************************************************************************
**** UNDER-18s  **************************************************************************************************************************************
******************************************************************************************************************************************************
******************************************************************************************************************************************************

/* MISSING VALUES: 
	1. FOR YP SURVEY: 537 missing values for dal7ut, of which 90 are for those refusing alevr (1st q), 12 not answering alfrew (2nd q), 35 not allast,
	etc.  But can't tell between refusals and those who couldn't remember; basically just one missing value is coded 	
	2. FOR GHS: 6 (of 238) missing at age 16; 5 (of 202) missing at age 17.  Compared to overall missing of 45 (of 15393 full interviews), this seems
	to suggest no particular bias at these ages.  
  NUMBERS OF VALUES:
	YP SURVEY: large n for 11-15 year olds (over 1,500), but n=294 for age 16 - and presumably these are relatively young ones in same school year as 15yos
	GHS: n=275 for age 16, n=267 for age 17		*/

* Young people survey: unweighted mean of dal7ut by age for under-16s, kept in matrices for the results table
* (the closing quote was missing from the path below; it is needed because the path contains spaces and parentheses)
cd "Z:\IAS STATA files\Young people survey (SN4648)\stata6"
/* 	use sdd01data.dta, replace
	keep sn-age alevr-al7ppbt dal7br-dalfrq8
	mvdecode dal7ut, mv (-9)
	save working.dta, replace		*/
use working.dta, replace
mean dal7ut if age<16, over(age)
matrix define ypmean=e(b)
matrix define ypvar=e(V)					//  Each of these needs square-rooting in 1,1; 2,2 etc. to get the std. error

******************************************************************************************************************************************************
* TO GHS DATA
* GHS: weighted mean of drating by age for under-18s, kept in matrices for the results table
cd "Z:\IAS STATA files\GHS 2000-1\stata8"
use working.dta, replace
svyset _n [pweight=weight00], vce(linearized)
svy: mean drating if age<18, over(age)
matrix define tnmean=e(b)
matrix define tnvar=e(V)

* THIS JUST PRESENTS THE RESULTS INTO AN EASY TO USE FORMAT
* Rows 1-6 come from the young people survey matrices (ypmean/ypvar) and rows 7-8 from the GHS matrices
* (tnmean/tnvar); each row is labelled as age i+9 and shows the mean with a normal-approximation 95% interval.
* The GHS rows previously took their standard errors from ypvar, i.e. from the wrong survey; they now use tnvar.
* NOTE(review): the labels assume ypmean holds six age groups starting at age 10 - confirm against the data.
global CI = -invnormal(0.025)
display $CI
local i=1
while `i' < 9	{
			if `i' < 7	{
					display `i'+9 " year olds" 										///
						_col(20) %4.2f ypmean[1,`i']									///
						_col(40) %4.2f ypmean[1,`i']-$CI*sqrt(ypvar[`i',`i']) ", "				///
						%4.2f ypmean[1,`i']+$CI*sqrt(ypvar[`i',`i'])
					}
			else		{
					display `i'+9 " year olds" 										///
						_col(20) %4.2f tnmean[1,`i'-6]								///
						_col(40) %4.2f tnmean[1,`i'-6]-$CI*sqrt(tnvar[`i'-6,`i'-6]) ", "			///
						%4.2f tnmean[1,`i'-6]+$CI*sqrt(tnvar[`i'-6,`i'-6])
					}
			local i = `i' + 1
			}
svy: mean drating if age>=18			// Adult consumption to compare this to



******************************************************************************************************************************************************
******************************************************************************************************************************************************
**** DRINK-DRIVING  DATA CHECKS AND IMPUTATION *******************************************************************************************************
******************************************************************************************************************************************************
******************************************************************************************************************************************************


/*	Notes on the ONS dataset:
		DRINK-DRIVING VARS: m303_11 to m303_18 (for drink-driving), plus the created variables alcfreq, drvfreq, ddfreq, bacfreq.  For these, 
		0.375 = once or twice a month, 0.115 = once every couple of months, and 0.29 = once or twice in the past 12 mths.
		12MTH DRINKING VARS: m192_1 to droften
		WEEK DRINKING VARS: m192_20-mi192_25 [on how many days], m192_54-BSpecSh					
	All the sample answered the drinking section, but only about half answered the drink-driving section (cf. the _merge variable)	*/

* INCONSISTENT ANSWERS AND MISSING DATA - this is all done in the ONS data prep file, although additional cleaning is done below
/* MISSING VARS: 
	1. Those coded as 97 in the amount files (meaning 'other amount') are covered in data cleaning; basically all reasonable estimates are
	made, and only those that are completely incomprehensible AND look significant (i.e. relatively often) are down as missing in the 
	derived vars (unityr).  
	2. 2 people refused to answer the alcohol section at all (m192_1==8); they are put as missing (.r) for droften, unityr, abst.  Because .r is used, 
	all if expressions have to be <. rather than ~=.
	3. 1 person answered m192_1 as yes (i.e. they are a drinker), but then refused to answer any more alcohol questions (e.g. stbrew==98); they 
	are also put as .r in unityr (but not abst)
	4. Some people put 'don't know' (99) as their answers to some of the alcohol questions; these values are simply missing for unityr.		*/

/*  ABSTAINERS: need differential classification, as clearly they were not asked drink-driving questions
	1. Those people who are abstainers are coded as ABST==1.  This is defined as those who said they never drank (from q m192_2, which is filtered from 
	m192_1), OR  those who said they did drink (and were therefore asked more detailed questions) but reported they hadn't drunk at all in the 
	last year (i.e. droften was 8).  		*/

/*  DRINK-DRIVING: 
	1. Missing values inc. don't knows and refusals (.r, .d and .) for the drink-driving vars arise from the original questions.  However, 
	this is corrected for routing where it is assumed the answer is 0 (e.g. non-drinkers have 0 imputed for dd freq).
	2. Those refusing to answer a question but also assumed to answer 0 were initially imputed (i.e. those people who don't drink).  However, this 
	runs the risk of increasing the level of non-response bias, so (at a later stage in the prep file) is turned back into non-response.
	3. Unlike for the other variables, non-response bias here may be a large problem.  Those refusing to answer the section on drink-driving (or some
	questions within it) have n=18, compared to n=21 for bacfreq>1-2 every couple of months.	
	4. 3 people didn't know the answers to m303_16, and 1 didn't to m303_1, which is problematic for the estimate in method 4		*/



* Load the ONS working dataset, list the notes stored with it, then run the ONS data preparation file
cd "Z:\IAS STATA files\ONS Omnibus 2002\stata6"
use working.dta, replace
notes
do "E:\Current work\CSR\Data analysis\ONS data prep.do"		// This file contains all the data preparation for ONS; is documented itself



*** COMPARING DATA OUTPUT TO OFFICIAL REPORTS: see Excel ssheet tab on 'ONS checks' for details here
tab freq12m [fweight=fw]			// This compares frequency
capture drop unitgrp				// This is for comparing volume groups
* Each band now starts where the previous one ends.  recode applies the first rule that matches, so a shared
* boundary value stays in the lower band and whole-number values are grouped exactly as before.  The bands
* previously left gaps (10-11, 21-22, 35-36), so a fractional value such as 10.5 fell through to the final
* nonmis rule and was labelled "51+ units".  The -1 rule and the nonmis catch-all are unchanged.
recode unityr (-1=1 "Abstainer")(0/1=2 "Less than 1 unit") (1/10=3 "1-10 units") (10/21=4 "11-21 units")(21/35=5 "22-35 units") ///
			(35/50=6 "36-50 units")(nonmis=7 "51+ units"), gen(unitgrp)
* Anyone flagged as an abstainer goes into group 1 whatever their unityr value
replace unitgrp=1 if abst==1
* Frequency-weighted and unweighted tables by sex, then the survey-weighted two-way table
bysort respsex: tab unitgrp [fweight=fw]
bysort respsex: tab unitgrp 
quietly svyset _n [pweight=wta], vce(linearized)
svy, vce(linearized): tabulate unitgrp respsex, count column percent



***  LOGICALLY INCONSISTENT ANSWERS:
* Each imposs flag is 1 when the first frequency is lower than the second, 0 otherwise, and missing when either is missing;
* _imposs is 1 when any of the three flags is 1 and 0 otherwise (so it is never missing)
capture drop imposs* _imposs
gen imposs1=drvfreq<ddfreq if drvfreq<. & ddfreq<.			// n=5 for this impossible answer
gen imposs2=alcfreq<ddfreq if alcfreq<. & ddfreq<.			// n=16 for this impossible answer
gen imposs3=ddfreq<bacfreq if ddfreq<. & bacfreq<.			// n=6 for this impossible answer
gen _imposs=(imposs1==1|imposs2==1|imposs3==1)
/*	These are still included in the analysis, as they include a disproportionate number of those reporting high dd frequencies (by their nature).  For
	example, there are 4 people who report dd every day who give logically inconsistent answers, but 12 who give some kind of inconsistent answer!  Hence
	these must be included really, but note the problems of this in the report (see note in Word doc).			*/




******************************************************************************************************************************************************
******************************************************************************************************************************************************
**** DRINK-DRIVING RESULTS ***************************************************************************************************************************
******************************************************************************************************************************************************
******************************************************************************************************************************************************

/* 	Note here that the first few questions (up to m303_13) are about frequencies of drink-driving, while the later questions (m303_14 to m303_18) are 
	about the last time the person drove when they thought they were over the legal limit		*/

* Drink-driving results setup: reload the ONS working dataset, recode m303_12 and declare the survey weight
cd "Z:\IAS STATA files\ONS Omnibus 2002\stata6"
use working.dta, replace

* NOTE(review): the global macro named if is defined here but not referenced in the visible part of this file - confirm whether the user commands read it
global if "if ddfreq<. & unityr<."
* Turn the value 99 in m303_12 into the extended missing value .a
mvdecode m303_12, mv (99=.a)
svyset [pweight=wta]

/* 	For imputed data (for missing variables only), add:	
		replace ddfreq=ddfreq_i
		replace bacfreq=bacfreq_i
		replace m303_12=m303_12i
	For non-imputed data, add:
		replace ddfreq=ddfreq_o
		replace bacfreq=bacfreq_o
		replace m303_12=m303_12o
	*/


* Test commands: e1 then e2, each with ddfreq and then bacfreq, on cases where that frequency, unityr and alcfreq are all non-missing
foreach cmd in e1 e2 {
	foreach freq in ddfreq bacfreq {
		`cmd' unityr `freq' if `freq'<. & unityr<. & alcfreq<.
	}
}

* Test bootstraps: the same four runs with 10 replications of e(res1) each
foreach cmd in e1 e2 {
	foreach freq in ddfreq bacfreq {
		bootstrap e(res1), reps(10): `cmd' unityr `freq' if `freq'<. & unityr<. & alcfreq<.
	}
}

* BCA bootstraps
capture drop _merge
set more off
set seed 3957575								// Seed borrowed from someone else's program; fixing it keeps the replications reproducible
local cmds "e1 e2"
* ddfreq runs: point estimate in r1_*, BCa confidence interval in r2_*
foreach cmd of local cmds {
	bootstrap e(res1), bca reps(2000): `cmd' unityr ddfreq if ddfreq<. & unityr<. & alcfreq<.
	matrix r1_`cmd' = e(b)
	matrix r2_`cmd' = e(ci_bca)
}
svy: proportion uni_e1f uni_e2f

* bacfreq runs: point estimate in r3_*, BCa confidence interval in r4_*, with the proportion checked after each command
foreach cmd of local cmds {
	bootstrap e(res1), bca reps(2000): `cmd' unityr bacfreq if bacfreq<. & unityr<. & alcfreq<.
	matrix r3_`cmd' = e(b)
	matrix r4_`cmd' = e(ci_bca)
	svy: proportion uni_`cmd'f
}

* Two lines per command: estimate and interval bounds for ddfreq, then for bacfreq
foreach cmd of local cmds {
	display "`cmd' ddfreq" 		_col(20) %5.1f r1_`cmd'[1,1]		_col(40) %5.1f r2_`cmd'[1,1] ", " %5.1f r2_`cmd'[2,1]
	display "`cmd' bacfreq" 	_col(20) %5.1f r3_`cmd'[1,1]		_col(40) %5.1f r4_`cmd'[1,1] ", " %5.1f r4_`cmd'[2,1]
}

